Data setup

Read in source data.

library(tidyverse)
library(mrcommons)
library(mrdrivers)
library(magrittr)

# reed in source data
foodEmi <- readSource("EDGARfood", subtype="foodSystemSector")
foodBal <- calcOutput("FAOmassbalance", aggregate = F)

Process source data

# This chunk will generate a dataset where the observations are year-country combinations. 
# The features are greenhouse gas emissions of different food system sectors "GWP..." and FAO data food system metrics for the country "k..."


#interpolate the FAO data
foodBal <- time_interpolate(foodBal, seq(1990,2018) )


# aggregate food products to kcr kli kds k
# for a longer version of the abbreviations run: 
# View(read.csv(system.file("extdata", "reportingnames.csv", package = "magpiesets")))
# also "magpiesets\doc\magpiesets.html" in the R library folder is helpful
sets <- read.csv(system.file("extdata", "magpiesets.csv", package = "magpiesets"))
sets %>% select(all,kcr,kli,ksd,k) -> mapping
foodBal %>% toolAggregate(rel= slice(mapping, 1:44),from="all",to="kcr+kli+ksd+k" , 
                          dim = 3.1, partrel  =T) %>%
        .[,,"",invert=T] -> foodBal_p

# subset the data that will be used for the emission factors
foodBal_p <- foodBal_p[,,c("wm","dm","nr")][,,c("production","food", "milling" ,
                                                "refining", "extracting","fermentation",  
                                                "distilling", "waste", "domestic_supply", 
                                                "households")] 

# aggregate the processing items to processing category
map2 <- tibble(source= c("production", "food",       "milling",    "refining",   "extracting","fermentation", "distilling", 
                         "waste", "domestic_supply", "households")  , 
               tar =   c("production", "processing", "processing" ,"processing", "processing","processing",   "processing", 
                         "waste","domestic_supply", "households"))
foodBal_p2<-  toolAggregate(foodBal_p, map2, from="source", to="tar",dim=3.2)


# pivot the objects to wide format, such that observations are country-year combinations
pivot_wider(as.data.frame( foodBal_p2), id_cols = c("Year", "Region"), names_from = starts_with("Data") ,  values_from = "Value") ->datBal
pivot_wider(as.data.frame( foodEmi), id_cols = c("Year", "Region"), names_from = starts_with("Data") ,  values_from = "Value") ->datEmi

#join emission data with FAO data
data<- left_join(datEmi, datBal)

Add possible drivers/predictors to the data.

##########  add drivers to the data


### add extra information from foodBal/FAO as drivers

# add finer resolved processing and production items
foodBal[,,c("food","milling","refining","extracting"  ,"distilling") ][,,"dm"] %>% dimSums(.,dim=3.2) %>%  collapseDim() %>% as.data.frame() %>%  
  pivot_wider(names_from = Data1, values_from = "Value") -> driv_procc
foodBal[,,c("production") ][,,"dm"]  %>%  collapseDim() %>% as.data.frame() %>%  
  pivot_wider(names_from = Data1, values_from = "Value") -> driv_prod
driv_procc %>% rename_with( .cols = !(1:3) , .fn =  ~ paste0("driv_proc_", .x) ) -> driv_procc
driv_prod %>% rename_with( .cols = !(1:3) , .fn =  ~ paste0("driv_prod_", .x) ) -> driv_prod 
data<- left_join(data, driv_procc )
data<- left_join(data, driv_prod )


#add export as driver
exp <- dimSums(foodBal[,,"export"][,,"wm"], dim=3)
data<- left_join( data, select(as.data.frame( exp) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_ec_exp = Value)


### add drivers from other data sources in the mr universe

#read data
gdp <- calcOutput("GDPPast", aggregate = F)
pop <- calcOutput("PopulationPast", aggregate = F)
affl <- collapseDim(gdp)/collapseDim(pop)
urb <- calcOutput("UrbanPast", aggregate = F)
dS <- calcOutput("DevelopmentState", aggregate = F) # needs to be interpolated
dS <- time_interpolate(dS, 1990:2018)
govI <- calcOutput("GovernanceIndicator", aggregate = F) # only from 1995
govI <- time_interpolate(govI, 1990:2018)

#add data
data<- left_join( data, select(as.data.frame( affl) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_ec_aff = Value)
data<- left_join( data, select(as.data.frame( pop) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_ec_pop = Value)
data<- left_join( data, select(as.data.frame( gdp) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_ec_gdp = Value)
data<- left_join( data, select(as.data.frame( urb) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_ec_urb = Value)
data<- left_join( data, select(as.data.frame( govI[,,"SSP1"]) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_ec_govI = Value)
data<- left_join( data, select(as.data.frame( dS[,,"SSP1"]) , Year, Region, Value), 
                  by=c("Region","Year")) %>%  rename(driv_ec_dS = Value)



## add some informative data about regions and country names (for plotting)
regmap        <- toolGetMapping("regionmappingH12.csv")
data<- left_join(data, regmap %>% rename(Region=CountryCode) ,by="Region") %>%  
  rename( info_countrynames = X, info_largReg  = RegionCode)




### adding principal components since they capture the variance with less number of variables
# data %>% select(starts_with("k")) %>%  prcomp ->k_pca
# 
# pcas <- as.data.frame( k_pca$x[,1:12])
# 
# names(pcas) <- paste0("K_", names(pcas))
# 
# data <- cbind(data, pcas)

Remove data that un not usable for the regression.

### REMOVE ROWS
#sort out countries that are completely zero in fooBal
data %>% group_by(Region) %>% summarise(across(starts_with("k"), sum)) %>% rowwise %>%  
  mutate(sum= sum(across(starts_with("k") )) ) %>% filter(sum==0) %>%  pull(Region) -> sortout # zeros
data %<>% filter(! Region %in% sortout)

#remove rows with only NAs in foodBal columns #TODO: document which year_countries are removed
#data %>% mutate(sum= rowSums( across( starts_with("k"), function(x) is.na(x)  )  )) %>% filter(sum < ) %>% 
#  select(!sum) ->  data


### REMOVE COLS
#remove emission columns that are completely zero 
data %<>% select(
  !( contains("F-gases" )&!contains("Packaging")&!contains("Retail") )  #evaluates to FALSE when columns have F-gases but not Packaging or retail in their name
  ) 

#remove households_dm which is a completly zero data column
data %<>% select(!(contains("households_dm") )) 

# remove cell column  that isn't needed
data %<>% select(-Cell)

#remove any other numeric columns with absolute value that sums up to zero i.e. data that exists but is always zero
data %>% summarise(across(everything(), function(x) ifelse(is.numeric(x),  sum(abs(x), na.rm = TRUE)>0, TRUE) )) %>% unlist() ==0 -> zerocols

which(zerocols) # print them
## driv_proc_distillers_grain          driv_proc_ethanol 
##                         89                         90 
##         driv_proc_oilcakes           driv_proc_fibres 
##                         94                        109 
##            driv_proc_foddr             driv_proc_begr 
##                        111                        117 
##             driv_proc_betr          driv_proc_pasture 
##                        118                        119 
##      driv_proc_res_cereals      driv_proc_res_fibrous 
##                        120                        121 
##   driv_proc_res_nonfibrous              driv_proc_scp 
##                        122                        123 
##             driv_proc_wood         driv_proc_woodfuel 
##                        124                        125 
##             driv_prod_begr             driv_prod_betr 
##                        158                        159 
##              driv_prod_scp 
##                        164
data[!zerocols] ->data  #remove them

Choose denominators of emission factors

Emission factors assume a linear relationship between the emissions from a given source and a related (economic) activity. While the linear relation does not need to be the same for all countries and times, a good starting point to choose which emission to link to which activity might still be the global correlation coefficients.

### corellation plots
data %>% select(!Year&!Region &!starts_with("driv_")&!starts_with("inf")&!starts_with("K_PC") )-> datac

datac %>% drop_na() %>% cor() %>% 
  .[1:26,27:ncol(datac)] %>% t -> tem  

tem[order(str_extract(row.names(tem), "_.*_") ) , ] %>%  corrplot::corrplot( col.lim = c(-0.1,1), method = "number")

The correlation structure is less clear than one could hope for. For the production emissions it seems common-sensical to choose a production quantity as a denominator. “k_production_dm” seems to be a good first idea due to the high correlations.
For the processing emissions there is no single item with high correlations. “k_processing_dm” is chosen for N2O , “kli_processing_dm” for CO2 and “kcr_processing_wm” for CH4.

Regression analysis

As said the emission factors can vary for different times and regions. We are interested in finding the underlying drivers in order to predict emissions for different economic scenarios.

A simple linear regression model for the emission factors is

\[ emission_{g,c,y}/ m_{c,y} =a+b_{1}* d_{1,c,y}+b_2* d_{2,c,y} + b_3* d_{3,c,y}+ \cdots + \eta \] with \[\eta \sim N(0,\epsilon )\] g is the GHG, c is the country, y is the year, a is the intercept, \(b_i\) are the slopes, \(d_{i,c,y}\) are the drivers in the data and \(\eta\) is the normally distributed error.

Since we are interested in predicting the emissions and not the emission factor in the end it could make sense to change the model a bit by bringing the factor \(m_{c,y}\) to the RHS. The least squares estimates will minimize residual squares for the emissions and not the emission factors then.

\[ emission_{g,c,y} =a* m_{c,y}+ b_1* d_{1,c,y}* m_{c,y}+b_2* d_{2,c,y}*m_{c,y} + b_3* d_{3,c,y}*m_{c,y}+ \cdots + \gamma \] with \[\gamma \sim N(0,\delta )\]

In the following the two options with total emissions as dependent variable and with emission factors as depended variable will both be explored.

It should be mentioned that a standard assumption of this regression model is violated in our case: The observations \(emission_{g,c,y}\) \(m_{c,y}\) \(d_{i,c,y}\) for different values of c and y are not independent. Hence the results, especially the significance of influence of drivers, need to be treated with caution. A way of avoiding this assumption, would be the use of mixed effect models. However, to get a first idea of the drivers the standard model is used.

The following linear models where build by successive elimination of independent variables, mainly based on p-values. Further olsrr a tool for automatic selection of independent variables was used once the number of variables was small enough (i.e. below 11).

To facilitate the elimination process the following function elimHelper is used.

#lR         < data frame; basis for the linear model. Column "emiFac" is the dependent variable. All other variables will be treated as independend variables.
#withRval   < Boolean;    show R values in addition to p values?
#auto_elim  < integer;    automatically eliminate variables with highes p-value until auto_elim variables are left 
#weight     < vector;     true use vec as a weight in regression
elimHelper <- function(lR, withRval, auto_elim, weight) {
  useweight= !is.null(weight)
  
  
  # automatically eliminate the values with highest p value unit cap of auto_elim is reached
  if(ncol(lR)>auto_elim)
  {
  
  while(ncol(lR)>auto_elim){
    if(useweight){ model <- lm( emiFac ~ . , data = lR, weights = weight)}
    else{ model <- lm( emiFac ~ . , data = lR)}
    
    p<-summary(model)$coefficients[,4]
    p<- p[!names(p) %in% "(Intercept)"]
    elim<-which.max(p)
    if(names(elim) %in% colnames(lR)){
      lR %<>% select(!names(elim))
    }
    else{
      break
    }
  }
    return(lR)
    
  }
  
  
  # main elimination loop
  flag=T # will be set to FALSE one "stop" is typed in
  while(flag)
  {
    
    # set up model and output r squared
    #browser()
    if(useweight){ model <- lm( emiFac ~ . , data = lR, weights = weight)}
    else{ model <- lm( emiFac ~ . , data = lR)}
    print(summary(model)$r.squared)
    # print p-values
    p<-summary(model)$coefficients[,4]  
    print("\n p-value")
    print(sort(p))

    #print R2-values if withRval=TRUE
    if(withRval){ 
      f=NULL
      cN <- colnames(lR)
      for (n in cN[2:length(cN)]){
        lR2 <- lR %>% select( -n)
        if(useweight){ model2 <- lm( emiFac ~ . , data = lR, weights = weight)}
        else{ model2 <- lm( emiFac ~ . , data = lR)}       
        r2 <- summary(model2)$r.squared
        add <- setNames( r2,n)
        f <- c(f, add)
      }
      print("\n r-change")
      print(sort(f))
    }
    
    #nested loop to determine the next variabe that is to be eliminated based on user input
    flag2=TRUE #flag for loop to ask for the correct variable to be eliminated
    while(flag2){
      a=readline()

      
      if(a=="stop"){
        flag<-FALSE
        flag2<-FALSE
      }
      else{
        print(lR %>% select(matches(a)) %>% names())
        b=readline()
        if(b=="o"){             # confirm the variable with "o" for "ok"
          flag2<-FALSE          # the correct next variable to eliminate was confirmed
          lR %<>% select(!matches(a))
        }
        
      }
    }
  }
  return(lR)
}

Processing

N2O

1. Model; Emission fac as depend var, not weighted

Plot the distribution of emission factors and get information about possible outliers.

#### N2O processing emissions
# best denominator for emission factors: k_processing_dm , alternatively kli_processing_dm

# add depended variable  
datN2OProc <- data %>% filter(k_processing_dm!=0)  %>%  
  mutate(emiFac=GWP_100_N2O_Processing/k_processing_dm)

hist(datN2OProc$emiFac,100)

#VIEW EXTREME Emission Facs
datN2OProc %>% filter(emiFac<0 | emiFac>150) %>% select(emiFac, GWP_100_N2O_Processing, k_processing_dm, Region, info_countrynames ,Year) %>% DT::datatable(.)

Build model

# select all drivers to start the elimination process
datN2OProc_Model <- datN2OProc %>%  select(emiFac,starts_with(c("driv_ec","driv_proc","driv_prod", "ksd", "kli", "kcr") ) ) 

# best predictors where found by successive elimination of variables with these commands:
# lR_mreduced<-elimHelper_proc(datN2OProc)
# subs <- olsrr::ols_step_all_possible(m)

# set up a model with best resulting predictors
m<- lm( emiFac ~ driv_ec_exp + driv_ec_govI + driv_ec_dS, datN2OProc ) 

# add the fitted values for the emission factors and entailing predictions for the emissions to the data
datN2OProc %>% mutate(fitted= m$fitted.values) -> datN2OProc

summary(m)
## 
## Call:
## lm(formula = emiFac ~ driv_ec_exp + driv_ec_govI + driv_ec_dS, 
##     data = datN2OProc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.890  -1.617  -0.064   1.436 187.740 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.955173   0.261787  18.928  < 2e-16 ***
## driv_ec_exp  -0.041080   0.004618  -8.895  < 2e-16 ***
## driv_ec_govI  4.587137   0.600130   7.644 2.52e-14 ***
## driv_ec_dS    2.762052   0.280347   9.852  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.858 on 4984 degrees of freedom
## Multiple R-squared:  0.07636,    Adjusted R-squared:  0.07581 
## F-statistic: 137.4 on 3 and 4984 DF,  p-value: < 2.2e-16
#plot predicted emissions against true emissions

p<- ggplot(datN2OProc , 
                  # new model for emission factors
           aes(x= (fitted)*
                                k_processing_dm
                              , y= GWP_100_N2O_Processing
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

2. Model; Emission fac as depend var, weighted

# select all drivers to start the elimination process

# best predictors where found by successive elimination of variables with these commands:
 weight= datN2OProc_Model$driv_ec_pop

#  lR_mreduced<-elimHelper(datN2OProc_Model, T,50,weight)
#  
#  #m<- lm( emiFac ~ ., weights = weight , data = lR_mreduced ) 
#  
#  #summary(m)
#  subs <- leaps::regsubsets(emiFac ~ ., lR_mreduced, weights=weight, nbest=3)
#  
#  subsSum<-summary(subs)
#  compSubsets<- data.frame(
#                 Adj.R2 = subsSum$adjr2,
#                 driv = subsSum$which
#                 )
# #decide for a model size and look at the best model
# model<- compSubsets["X3",]
# model[which( model==TRUE)]

# set up a model with best resulting predictors
m<- lm( emiFac ~ driv_ec_aff+ksd_production_nr+kli_domestic_supply_nr, weights = weight, datN2OProc ) 
summary(m)
## 
## Call:
## lm(formula = emiFac ~ driv_ec_aff + ksd_production_nr + kli_domestic_supply_nr, 
##     data = datN2OProc, weights = weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -49.092  -2.923   0.708   5.346 223.228 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             7.141e+00  4.326e-02  165.06   <2e-16 ***
## driv_ec_aff             6.769e-05  2.479e-06   27.30   <2e-16 ***
## ksd_production_nr      -2.574e+00  6.003e-02  -42.88   <2e-16 ***
## kli_domestic_supply_nr  5.690e+00  1.265e-01   44.98   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.7 on 4984 degrees of freedom
## Multiple R-squared:  0.3111, Adjusted R-squared:  0.3107 
## F-statistic: 750.2 on 3 and 4984 DF,  p-value: < 2.2e-16
# add the fitted values for the emission factors and entailing predictions for the emissions to the data
datN2OProc %>% mutate(fitted= m$fitted.values) -> datN2OProc
#plot predicted emissions against true emissions

p<- ggplot(datN2OProc , 
                  # new model for emission factors
           aes(x= (fitted)*
                                k_processing_dm
                              , y= GWP_100_N2O_Processing
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

3. Model; Total emission as depend var, not weighted

#plot predicted against true emissions with the old method (for comparison)

p<- ggplot(
  data #%>% filter(Year=="2000" )
                      , aes(x=(8.716686       + driv_ec_pop          * 0.004280      +  k_processing_nr   *2.2303566  + kcr_processing_nr* -3.3071144 )*
                              k_processing_dm
                              , y= GWP_100_N2O_Processing
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

Plot total emissions against total GDP

#plot predicted against true emissions with the old method (for comparison)

p<- ggplot(
  datN2OProc #%>% filter(Year=="2000" )
                      , aes(x= GWP_100_N2O_Processing
                              , y= driv_ec_gdp
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

Plot emission factor against per capita GDP

#plot predicted against true emissions with the old method (for comparison)

p<- ggplot(
  datN2OProc #%>% filter(Year=="2000" )
                      , aes(x= emiFac
                              , y= driv_ec_aff
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

CO2

1. Model; Emission fac as depend var, not weighted

Plot the distribution of emission factors and get information about possible outliers.

#Canada, Australia and New Zealand: CAZ; China: CHA; European Union: EUR; India: IND; Japan: JPN; Latin America: LAM; Middle East and North Africa: MEA; non-EU member states: NEU; other Asia: OAS; reforming countries: REF; sub-Saharan Africa: SSA; United States: USA

#### CO2 processing emissions
 #data$driv_exp

# setup data
data %>% filter(kli_processing_wm!=0) %>%  mutate(emiFac= GWP_100_CO2_Processing/kli_processing_nr) -> datCO2Proc

hist(datCO2Proc$emiFac,100)

#VIEW EXTREME Emission Facs
datCO2Proc %>% filter(emiFac<0 | emiFac>150000) %>% select(emiFac, GWP_100_CO2_Processing, kli_processing_nr, Region, info_countrynames ,Year) %>% DT::datatable(.)
# setup data for building the model i.e. select possible drivers
datCO2Proc_model <- datCO2Proc %>%  select(emiFac, starts_with(c("driv_ec","driv_proc","driv_prod", "ksd", "kli", "kcr") ) )

Build a model with emission factors as dependent variable.

# get final model
model_final <- lm( emiFac ~ driv_ec_dS + ksd_households_nr,  datCO2Proc)

summary(model_final)
## 
## Call:
## lm(formula = emiFac ~ driv_ec_dS + ksd_households_nr, data = datCO2Proc)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38104 -12449  -2324   3908 204453 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          544.7      555.4   0.981    0.327    
## driv_ec_dS         35598.3      807.5  44.084   <2e-16 ***
## ksd_households_nr  85162.7     7604.8  11.199   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22330 on 4985 degrees of freedom
## Multiple R-squared:  0.2912, Adjusted R-squared:  0.2909 
## F-statistic:  1024 on 2 and 4985 DF,  p-value: < 2.2e-16
datCO2Proc %>% mutate(fitted2= model_final$fitted.values) ->datCO2Proc
#plot predicted emissions against true emissions

                              #new model for emission factors
p<- ggplot(datCO2Proc, aes(x= (544.7+ 35598.3 *driv_ec_dS        + 85162.7   *ksd_households_nr  )* 
                             kli_processing_nr, 
                           y= GWP_100_CO2_Processing, 
                           label = Region, Year= Year, country= info_countrynames  
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

2. Model; Emission fac as depend var, weighted

# select all drivers to start the elimination process

# best predictors where found by successive elimination of variables with these commands:
 weight= datCO2Proc_model$driv_ec_pop

#  lR_mreduced<-elimHelper(datCO2Proc_model, T,50,weight)
#  
#  #m<- lm( emiFac ~ ., weights = weight , data = lR_mreduced ) 
#  
#  #summary(m)
#  subs <- leaps::regsubsets(emiFac ~ ., lR_mreduced, weights=weight, nbest=3)
#  
#  subsSum<-summary(subs)
#  compSubsets<- data.frame(
#                 Adj.R2 = subsSum$adjr2,
#                 driv = subsSum$which
#                 )
# #decide for a model size and look at the best model
# model<- compSubsets["X3",]
# model[which( model==TRUE)]

# set up a model with best resulting predictors
m<- lm( emiFac ~ driv_ec_dS+driv_prod_brans+kli_waste_nr, weights = weight, datCO2Proc ) 
summary(m)
## 
## Call:
## lm(formula = emiFac ~ driv_ec_dS + driv_prod_brans + kli_waste_nr, 
##     data = datCO2Proc, weights = weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -490943  -26254   -9341    5300  768309 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      1.938e+03  4.527e+02   4.281  1.9e-05 ***
## driv_ec_dS       3.605e+04  5.997e+02  60.120  < 2e-16 ***
## driv_prod_brans  2.770e+03  3.336e+01  83.048  < 2e-16 ***
## kli_waste_nr    -1.663e+06  2.921e+04 -56.923  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 92530 on 4984 degrees of freedom
## Multiple R-squared:  0.6499, Adjusted R-squared:  0.6497 
## F-statistic:  3084 on 3 and 4984 DF,  p-value: < 2.2e-16
# add the fitted values for the emission factors and entailing predictions for the emissions to the data
datCO2Proc %>% mutate(fitted= m$fitted.values) -> datCO2Proc
#plot predicted emissions against true emissions

p<- ggplot(datCO2Proc , 
                  # new model for emission factors
           aes(x= (fitted)*
                                kli_processing_nr
                              , y= GWP_100_CO2_Processing
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

3. Model; Total emissions as depend var, not weighted

data %>% filter(kli_processing_wm!=0) %>%  mutate(emiFac= GWP_100_CO2_Processing/kli_processing_nr) -> lR

lR_m <- lR %>%  select(emiFac, starts_with(c("driv_ec","driv_proc","driv_prod", "ksd", "kli", "kcr") ) ) 

                              #old model for emission factors
p2<- ggplot(data  , aes(  x=  (14614.71 + driv_proc_brans *1651.60 + driv_proc_sugr_beet  *5740.25 )* 
                              kli_processing_nr   , 
                          y= GWP_100_CO2_Processing, label = Region, Year= Year   , 
                          country = info_countrynames
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))

Plot total emissions against total GDP

#plot predicted against true emissions with the old method (for comparison)

p<- ggplot(
  datCO2Proc #%>% filter(Year=="2000" )
                      , aes(x= GWP_100_CO2_Processing
                              , y= driv_ec_gdp
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

Plot emission factor against per capita GDP

#plot predicted against true emissions with the old method (for comparison)

p<- ggplot(
  datCO2Proc #%>% filter(Year=="2000" )
                      , aes(x= emiFac
                              , y= driv_ec_aff
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

CH4

1. Model; Emission factors as depend var, not weighted

Plot the distribution of emission factors and get information about possible outliers.

data %>% filter(kcr_processing_wm!=0) %>%  mutate(emiFac= GWP_100_CH4_Processing/kcr_processing_wm) -> datCH4Proc

#datCH4Proc %>% filter(!(emiFac>0 & emiFac<2000))

#datCH4Proc %<>% filter(emiFac>0 & emiFac<500) 

### VIEW NEGATIVE emiFac VALUES
datCH4Proc %>% filter(emiFac< 0|emiFac>3000 ) %>% select(emiFac, GWP_100_CH4_Processing, kcr_processing_wm, Region, Year) %>% DT::datatable(.)
hist(datCH4Proc$emiFac,1000)

Since there are clearly outlier with extremely negative emission factors, I decided to filter those out first.

datCH4Proc %<>% filter(!emiFac< 0, emiFac<3000 ) 

Built the model

datCH4Proc_model <- datCH4Proc %>%  select(emiFac, starts_with(c("driv_ec","driv_proc","driv_prod", "ksd", "kli", "kcr") ) ) 

#datCH4Proc_model_red <- elimHelper(datCH4Proc_model, F, 20)
#datCH4Proc_model_red <- elimHelper(datCH4Proc_model_red, T, 20)

# ... 

#m<- lm(emiFac ~ ., datCH4Proc_model_red)

#k<-olsrr::ols_step_all_possible(m)

model<- lm(emiFac ~ driv_ec_aff  , datCH4Proc)

datCH4Proc %>% mutate(fitted= model$fitted.values) %>% mutate(predicted= fitted*kcr_processing_wm ) ->datCH4Proc
summary(model)
## 
## Call:
## lm(formula = emiFac ~ driv_ec_aff, data = datCH4Proc)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -365.20  -26.91   -8.51   -1.55 2199.09 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.734e+00  2.034e+00   1.836   0.0664 .  
## driv_ec_aff 2.959e-03  9.645e-05  30.676   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 109.3 on 4970 degrees of freedom
## Multiple R-squared:  0.1592, Adjusted R-squared:  0.159 
## F-statistic:   941 on 1 and 4970 DF,  p-value: < 2.2e-16
#plot predicted emissions against true emissions

p<- ggplot(
  datCH4Proc
                      , aes(x= (4.2339802  + driv_ec_aff* 0.0031080)*
                              kcr_processing_wm
                              , y= GWP_100_CH4_Processing
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

2. Model; Emission factors as depend var, weighted

# select all drivers to start the elimination process

# best predictors where found by successive elimination of variables with these commands:
 weight= datCH4Proc_model$driv_ec_pop

# lR_mreduced<-elimHelper(datCH4Proc_model, T,50,weight)
 
 #m<- lm( emiFac ~ ., weights = weight , data = lR_mreduced ) 
 
 #summary(m)
 #subs <- leaps::regsubsets(emiFac ~ ., lR_mreduced, weights=weight, nbest=3)
 
 #subsSum<-summary(subs)
 #compSubsets<- data.frame(
#                Adj.R2 = subsSum$adjr2,
#                driv = subsSum$which
#                )
#decide for a model size and look at the best model
#model<- compSubsets["X3",]
#model[which( model==TRUE)]

# set up a model with best resulting predictors
m<- lm( emiFac ~ driv_ec_exp+driv_ec_aff+driv_ec_urb, weights = weight, datCH4Proc ) 
summary(m)
## 
## Call:
## lm(formula = emiFac ~ driv_ec_exp + driv_ec_aff + driv_ec_urb, 
##     data = datCH4Proc, weights = weight)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -548.23  -45.31  -19.66    3.21 2670.27 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.854e+00  1.459e+00  -1.271    0.204    
## driv_ec_exp -1.508e-01  1.583e-02  -9.521   <2e-16 ***
## driv_ec_aff  7.059e-04  7.766e-05   9.090   <2e-16 ***
## driv_ec_urb  4.230e+01  3.697e+00  11.443   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 219 on 4968 degrees of freedom
## Multiple R-squared:   0.13,  Adjusted R-squared:  0.1295 
## F-statistic: 247.4 on 3 and 4968 DF,  p-value: < 2.2e-16
# add the fitted values for the emission factors and entailing predictions for the emissions to the data
datCH4Proc %>% mutate(fitted= m$fitted.values) -> datCH4Proc
#plot predicted emissions against true emissions

p<- ggplot(datCH4Proc , 
                  # new model for emission factors
           aes(x= (fitted)*
                                kcr_processing_wm
                              , y= GWP_100_CH4_Processing
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

3. Model; Total emissions as depend var, not weighted

p2<- ggplot(
  data #%>% filter(Region %in% c("BRA", "IDN", "IND") )
                      , aes(  x=    (37.4412095 + 
                                driv_ec_pop      *       -0.0289276  + 
                                driv_proc_others * 0.2461490)*kcr_processing_wm,
                              y= GWP_100_CH4_Processing, label = Region, Year= Year ,
                              country = info_countrynames 
                            #color = -4.212e+02 * driv_dS + driv_aff   *  5.742e-03 + driv_govI  *  1.019e+03
                            ))+ 
  geom_point(
    aes( ) 
    )+ 
  #geom_text(aes(label = Region))+
  geom_smooth(method='lm', formula= y~x, aes(group=1)) 

plotly::ggplotly(p2, tooltip = c("y", "Year", "country"))

Plot total emissions against total GDP

#plot predicted against true emissions with the old method (for comparison)

p<- ggplot(
  datCH4Proc #%>% filter(Year=="2000" )
                      , aes(x= GWP_100_CH4_Processing
                              , y= driv_ec_gdp
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))

Plot emission factor against per capita GDP

#plot predicted against true emissions with the old method (for comparison)

p<- ggplot(
  datCH4Proc #%>% filter(Year=="2000" )
                      , aes(x= emiFac
                              , y= driv_ec_aff
                              , label = Region, Year= Year, country= info_countrynames #, 
                            ))+ 
  geom_point()+ 
  geom_smooth(method='lm', formula= y~x, aes(group=1))

plotly::ggplotly(p, tooltip = c("y", "Year", "country"))